if (rstudioapi::isAvailable()) {
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}library(tidyverse)## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(conflicted)
conflict_prefer("filter", "dplyr")## [conflicted] Will prefer dplyr::filter over any other package
conflict_prefer("select", "dplyr")## [conflicted] Will prefer dplyr::select over any other package
conflict_prefer("slice", "dplyr")## [conflicted] Will prefer dplyr::slice over any other package
conflict_prefer("rename", "dplyr")## [conflicted] Will prefer dplyr::rename over any other package
conflict_prefer('intersect', 'dplyr')## [conflicted] Will prefer dplyr::intersect over any other package
cq = read_tsv('../data/Supplementary_Table_3_selected_circRNAs.txt')## Rows: 1560 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (26): chr, strand, circ_id, circ_id_strand, tool, cell_line, FWD_primer,...
## dbl (21): start, end, FWD_pos, FWD_length, REV_pos, REV_length, FWD_TM, REV_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all_circ = read_tsv("../data/Supplementary_Table_2_all_circRNAs.txt")## Rows: 1137099 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (16): chr, strand, cell_line, tool, circ_id, circ_id_strand, count_group...
## dbl (7): start, end, BSJ_count, n_detected, n_db, estim_len_in, BSJ_count_m...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
val = read_tsv('../data/Supplementary_Table_4_precision_values.txt')## Rows: 29 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): tool, count_group
## dbl (16): nr_qPCR_total, nr_qPCR_fail, nr_qPCR_val, nr_RR_total, nr_RR_fail,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cqall_circmake count table
cont_table = cq %>%
select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, count_group_median, n_db) %>%
unique() %>%
# only keep high abundant circRNAs
filter(count_group_median == 'count ≥ 5') %>%
# to use all val together
filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass', 'pass', 'fail')) %>%
# change nr of databases to binary
mutate(n_db = ifelse(is.na(n_db), 0, 1)) %>%
group_by(n_db) %>%
count(all_val) %>%
pivot_wider(values_from = n, names_from = all_val) %>%
ungroup() %>%
select(-n_db)
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("novel", "in_db")
cont_tableplot data
mosaicplot(cont_table,
main = "Mosaic plot",
color = TRUE)if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected## fail pass
## novel 15.83333 129.1667
## in_db 79.16667 645.8333
chisquare test
chisq.test(cont_table)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 154.88, df = 1, p-value < 2.2e-16
OR
OR = (cont_table[2,2]/cont_table[2,1])/(cont_table[1,2]/cont_table[1,1])
OR## [1] 13.13017
make count table
cont_table = cq %>%
select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, count_group_median, n_detected) %>%
unique() %>%
filter(count_group_median == 'count ≥ 5') %>%
# to use all val together
filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass', 'pass', 'fail')) %>%
# change nr of databases to binary
mutate(n_detected = ifelse(n_detected == 1, 0, 1)) %>%
group_by(n_detected) %>%
count(all_val) %>%
pivot_wider(values_from = n, names_from = all_val) %>%
ungroup() %>%
select(-n_detected)
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("unique", "multiple_tools")
cont_tableplot data
mosaicplot(cont_table,
main = "Mosaic plot",
color = TRUE)if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected## fail pass
## unique 7.752874 63.24713
## multiple_tools 87.247126 711.75287
chisquare test
chisq.test(cont_table)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 329.96, df = 1, p-value < 2.2e-16
OR
OR = (cont_table[2,2]/cont_table[2,1])/(cont_table[1,2]/cont_table[1,1])
OR## [1] 58.72597
make count table
cont_table = cq %>%
select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, count_group_median, nr_exons) %>%
unique() %>%
filter(count_group_median == 'count ≥ 5') %>%
# to use all val together
filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass', 'pass', 'fail')) %>%
# also remove the ones that do not have a exon annotation
filter(!is.na(nr_exons), !nr_exons == "ambiguous") %>%
mutate(exon_bin = ifelse(nr_exons == 1, 0, 1)) %>%
group_by(exon_bin) %>%
count(all_val) %>%
pivot_wider(values_from = n, names_from = all_val) %>%
ungroup() %>%
mutate(fail = ifelse(is.na(fail), 0, fail)) %>%
select(-exon_bin)
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("single_exon", "multi_exons")
cont_tableplot data
mosaicplot(cont_table,
main = "Mosaic plot",
color = TRUE)if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected## fail pass
## single_exon 7.243243 126.7568
## multi_exons 30.756757 538.2432
chisquare test
chisq.test(cont_table)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 15.451, df = 1, p-value = 8.466e-05
OR
OR = (cont_table[2,2]/cont_table[2,1])/(cont_table[1,2]/cont_table[1,1])
OR## [1] 3.791616
cq_start = cq %>%
mutate(start_exon_nr = substr(start_match, 19, 27),
start_exon_nr = ifelse(substr(start_exon_nr, 1, 1) == '_',
substr(start_exon_nr, 2, 10),
start_exon_nr))
cq_start cont_table = cq_start %>%
filter(count_group_median == 'count ≥ 5') %>%
select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, start_exon_nr) %>%
unique() %>%
filter(!is.na(start_exon_nr)) %>%
mutate(exon_1 = ifelse(start_exon_nr == "exon_1", 1, 0)) %>%
# to use all val together
filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass',
'pass', 'fail')) %>%
# change nr of databases to binary
group_by(exon_1) %>%
count(all_val) %>%
pivot_wider(values_from = n, names_from = all_val) %>%
ungroup()
cont_table=> not enough values
cont_table = cq %>%
filter(count_group_median == 'count ≥ 5',
!strand == 'unknown') %>%
select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, ss_motif) %>%
unique() %>%
mutate(ss_can = ifelse(ss_motif == "AGNGT", 1, 0)) %>%
# to use all val together
filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass',
'pass', 'fail')) %>%
# change nr of databases to binary
group_by(ss_can) %>%
count(all_val) %>%
pivot_wider(values_from = n, names_from = all_val) %>%
ungroup() %>%
select(-ss_can)
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("non_cannonical", "cannonical")
cont_tableplot data
mosaicplot(cont_table,
main = "Mosaic plot",
color = TRUE)if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected## fail pass
## non_cannonical 11.39306 96.60694
## cannonical 61.60694 522.39306
chisquare test
chisq.test(cont_table)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 42.447, df = 1, p-value = 7.261e-11
OR
OR = (cont_table[2,2]/cont_table[2,1])/(cont_table[1,2]/cont_table[1,1])
OR## [1] 5.195424
cont_table = cq %>%
filter(count_group_median == "count ≥ 5") %>%
select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, ENST) %>%
unique() %>%
# to use all val together
filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass',
'pass', 'fail'),
ENST_group = ifelse(is.na(ENST), 'no_match', "match")) %>%
# change nr of databases to binary
group_by(ENST_group) %>%
count(all_val) %>%
pivot_wider(values_from = n, names_from = all_val) %>%
ungroup() %>%
select(-ENST_group)
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("match", "no_match")
cont_tableplot data
mosaicplot(cont_table,
main = "Mosaic plot",
color = TRUE)if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected## fail pass
## match 85.619977 699.38002
## no_match 9.380023 76.61998
chisquare test
chisq.test(cont_table)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 203.19, df = 1, p-value < 2.2e-16
OR
OR = (cont_table[2,2]/cont_table[2,1])/(cont_table[1,2]/cont_table[1,1])
OR## [1] 0.04700229
cont_table = cq %>%
filter(count_group_median == "count ≥ 5") %>%
left_join(read_tsv('../data/details/tool_annotation.txt')) %>%
select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, approach) %>%
filter(!approach == 'integrative') %>%
unique() %>%
# to use all val together
filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass',
'pass', 'fail')) %>%
# change nr of databases to binary
group_by(approach) %>%
count(all_val) %>%
pivot_wider(values_from = n, names_from = all_val) %>%
ungroup() %>%
select(-approach)## Rows: 16 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): tool, approach, tool_lt, lin_annotation, strand_anno, splicing, BSJ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining, by = "tool"
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("candidate-based", "segmented read-based")
cont_tableplot data
mosaicplot(cont_table,
main = "Mosaic plot",
color = TRUE)if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected## fail pass
## candidate-based 21.6445 160.3555
## segmented read-based 71.3555 528.6445
chisquare test
chisq.test(cont_table)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 8.4882, df = 1, p-value = 0.003575
OR
OR = (cont_table[1,2]/cont_table[1,1])/(cont_table[2,2]/cont_table[2,1])
OR## [1] 2.761315
cont_table = cq %>%
select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, count_group_median) %>%
unique() %>%
# to use all val together
filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass',
'pass', 'fail')) %>%
# change nr of databases to binary
group_by(count_group_median) %>%
count(all_val) %>%
pivot_wider(values_from = n, names_from = all_val) %>%
ungroup() %>%
select(-count_group_median)
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("count < 5", "count ≥ 5")
cont_tableplot data
mosaicplot(cont_table,
main = "Mosaic plot",
color = TRUE)if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected## fail pass
## count < 5 30.84134 202.1587
## count ≥ 5 115.15866 754.8413
chisquare test
chisq.test(cont_table)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 18.31, df = 1, p-value = 1.877e-05
OR
OR = (cont_table[2,2]/cont_table[2,1])/(cont_table[1,2]/cont_table[1,1])
OR## [1] 2.286003
tool_anno = read_tsv('../data/details/tool_annotation.txt')## Rows: 16 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): tool, approach, tool_lt, lin_annotation, strand_anno, splicing, BSJ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sens = read_tsv('../data/Supplementary_Table_5_sensitivity_values.txt')## Rows: 32 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): tool, count_group_median
## dbl (3): nr_detected, nr_expected, sensitivity
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
add annotation to sensitivity
sens_anno = sens %>%
left_join(tool_anno) %>%
select(-tool_lt) %>% ungroup() %>%
filter(count_group_median == 'count ≥ 5')## Joining, by = "tool"
sens_annowilcox.test(sensitivity ~ approach, data=sens_anno %>% filter(!approach == 'integrative')) ##
## Wilcoxon rank sum exact test
##
## data: sensitivity by approach
## W = 6, p-value = 0.1264
## alternative hypothesis: true location shift is not equal to 0
kruskal.test(sensitivity ~ approach, data=sens_anno)##
## Kruskal-Wallis rank sum test
##
## data: sensitivity by approach
## Kruskal-Wallis chi-squared = 3.3122, df = 2, p-value = 0.1909
wilcox.test(sensitivity ~ lin_annotation, data=sens_anno)## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: sensitivity by lin_annotation
## W = 24, p-value = 0.5902
## alternative hypothesis: true location shift is not equal to 0
kruskal.test(sensitivity ~ strand_anno, data=sens_anno %>%
filter(!strand_anno == "no strand reported"))##
## Kruskal-Wallis rank sum test
##
## data: sensitivity by strand_anno
## Kruskal-Wallis chi-squared = 3.9841, df = 3, p-value = 0.2632
wilcox_ss = wilcox.test(sensitivity ~ splicing, data=sens_anno) ## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
wilcox_ss##
## Wilcoxon rank sum test with continuity correction
##
## data: sensitivity by splicing
## W = 55, p-value = 0.002206
## alternative hypothesis: true location shift is not equal to 0
sens_anno %>%
group_by(splicing) %>%
summarise(mean_sens = mean(sensitivity))N = nrow(sens_anno)
N## [1] 16
Z = qnorm(wilcox_ss$p.value/2)
Z## [1] -3.061034
r = abs(Z)/sqrt(N)
r## [1] 0.7652585
library(rstatix)
library(ggpubr)
sens_anno %>% rstatix::wilcox_test(sensitivity ~ splicing)sens_anno %>% rstatix::wilcox_effsize(sensitivity ~ splicing)#sens_anno %>% rstatix::wilcox_effsize(sensitivity ~ splicing, ci = T)median_diff = tibble()
for (tool_can in sens_anno %>% filter(splicing == "canonical") %>% pull(tool)) {
sens_can = sens_anno %>% filter(tool == tool_can) %>% pull(sensitivity)
for (tool_non_can in sens_anno %>% filter(splicing == "non-canonical") %>% pull(tool)) {
sens_non_can = sens_anno %>% filter(tool == tool_non_can) %>% pull(sensitivity)
median_diff = median_diff %>%
bind_rows(tibble(tool_can, sens_can, tool_non_can, sens_non_can))
}
}
median_diff = median_diff %>%
mutate(sens_diff = sens_can - sens_non_can)
median_diffmedian_diff %>% pull(sens_diff) %>% median()## [1] 0.3845161
wilcox.test(sensitivity ~ BSJ_filter, data=sens_anno) ## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: sensitivity by BSJ_filter
## W = 28, p-value = 1
## alternative hypothesis: true location shift is not equal to 0
val_cor = val %>%
# use perc_compound_val
select(tool, count_group, perc_compound_val) %>%
# get the number of detected circRNAs for each cell line and tool, per count group
left_join(all_circ %>%
group_by(cell_line, tool, count_group) %>%
summarise(total_n_ut = n())) %>%
# calculate the theoretical nr of validated circ
mutate(theo_TP_all = perc_compound_val * total_n_ut) %>%
left_join(sens, by = c('count_group' = 'count_group_median', 'tool')) %>%
group_by(tool, count_group, perc_compound_val, sensitivity) %>%
summarize(theo_TP_all_cl = sum(theo_TP_all)) %>% ungroup()## `summarise()` has grouped output by 'cell_line', 'tool'. You can override using
## the `.groups` argument.
## Joining, by = c("tool", "count_group")
## `summarise()` has grouped output by 'tool', 'count_group', 'perc_compound_val'.
## You can override using the `.groups` argument.
val_corbelow 5
val_cor_tmp = val_cor %>% filter(count_group == "count < 5")
cor.test(val_cor_tmp$sensitivity, val_cor_tmp$theo_TP_all_cl, method = 'spearman')## Warning in cor.test.default(val_cor_tmp$sensitivity,
## val_cor_tmp$theo_TP_all_cl, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: val_cor_tmp$sensitivity and val_cor_tmp$theo_TP_all_cl
## S = 58.159, p-value = 0.0003236
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8402236
above 5
val_cor_tmp = val_cor %>% filter(count_group == "count ≥ 5")
cor.test(val_cor_tmp$sensitivity, val_cor_tmp$theo_TP_all_cl, method = 'spearman')## Warning in cor.test.default(val_cor_tmp$sensitivity,
## val_cor_tmp$theo_TP_all_cl, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: val_cor_tmp$sensitivity and val_cor_tmp$theo_TP_all_cl
## S = 112.6, p-value = 0.0003534
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.7989279